{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "9881451c",
   "metadata": {},
   "source": [
    "## Example: Using LiteLLM model to access VLLM server\n",
    "\n",
    "Requirements:\n",
    "- Installed VLLM instance: Follow this [instruction](https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html#nvidia-cuda)\n",
    "\n",
    "Launch an VLLM instance:\n",
    "```\n",
    "vllm serve Qwen/Qwen3-1.7B --host 0.0.0.0 \\\n",
    "--port 8000 \\\n",
    "--reasoning-parser deepseek_r1 \\\n",
    "--enable-prefix-caching \\\n",
    "--guided-decoding-backend guidance \\\n",
    "--max-model-len 16384\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c21a05fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from pydantic import BaseModel\n",
    "import guidance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d708d02",
   "metadata": {},
   "outputs": [],
   "source": [
    "litellm_desc = {\n",
    "    \"model_name\": \"Qwen/Qwen3-1.7B\",\n",
    "    \"litellm_params\": {  # params for litellm completion/embedding call\n",
    "        \"model\": \"hosted_vllm/Qwen/Qwen3-1.7B\",\n",
    "        \"api_key\": os.environ.get(\"VLLM_API_KEY\", \"NO_KEY\"), # set your vLLM API key if needed\n",
    "        \"api_base\": \"http://localhost:8000/v1\", # change to your vLLM API base URL\n",
    "    },\n",
    "}\n",
    "base_lm = guidance.models.experimental.LiteLLM(model_description=litellm_desc, echo=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d74c4da5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_gen_test(lm):\n",
    "    with guidance.user():\n",
    "        lm += \"What is the capital of France? and its population?\"\n",
    "        lm += \"Format your answer as follows: Capital: <capital>, Population: <population>\"\n",
    "\n",
    "    with guidance.assistant():\n",
    "        lm += guidance.gen(max_tokens=1024, temperature=0.7, name=\"answer\")\n",
    "        print(lm[\"answer\"])\n",
    "\n",
    "run_gen_test(base_lm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62d7022e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_gen_stop_test(lm):\n",
    "    with guidance.user():\n",
    "        lm += \"What is the capital of France? and its population?\"\n",
    "        lm += \"Format your answer as follows: Capital: <capital>, Population: <population>\"\n",
    "        lm += \"Say 'STOP RIGHT THERE' when you are done.\"\n",
    "\n",
    "    with guidance.assistant():\n",
    "        lm += guidance.gen(max_tokens=1024, temperature=0.7, name=\"answer\", stop=[\"STOP\"])\n",
    "        print(lm[\"answer\"])\n",
    "\n",
    "run_gen_stop_test(base_lm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cb6de96e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_json_test(lm):\n",
    "    class CityInfo(BaseModel):\n",
    "        capital: str\n",
    "        population: int\n",
    "\n",
    "    with guidance.user():\n",
    "        lm += \"What is the capital of France? and its population? Output as JSON.\"\n",
    "\n",
    "    with guidance.assistant():\n",
    "        lm += guidance.json(schema=CityInfo, name=\"answer\")\n",
    "        print(lm[\"answer\"])\n",
    "\n",
    "run_json_test(base_lm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac9b0afc",
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_json_object_test(lm):\n",
    "    class CityInfo(BaseModel):\n",
    "        capital: str\n",
    "        population: int\n",
    "\n",
    "    with guidance.user():\n",
    "        lm += \"What is the capital of France? and its population? output json\"\n",
    "\n",
    "    with guidance.assistant():\n",
    "        lm += guidance.json(schema=None, name=\"answer\")  # No schema, just output JSON\n",
    "        print(lm[\"answer\"])\n",
    "\n",
    "run_json_object_test(base_lm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a789fc3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_lark_grammar(lm):\n",
    "    lark_grammar = \"\"\"\n",
    "start: \"Capital: \" CAPITAL \", Population: \" INT\n",
    "CAPITAL: /[A-Z][a-z]+/\n",
    "INT: /[0-9]+/\n",
    "\"\"\"\n",
    "\n",
    "    with guidance.user():\n",
    "        lm += \"What is the capital of France? and its population?\"\n",
    "\n",
    "    with guidance.assistant():\n",
    "        lm += guidance.lark(lark_grammar=lark_grammar, name=\"answer\")\n",
    "        print(lm[\"answer\"])\n",
    "\n",
    "run_lark_grammar(base_lm)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "guidance",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
